
# SET WORKING DIRECTORY

library(pscl)
library(nnet)
library(caret)
library(car)
library(RANN)

#######################################################################
##################       BOOTSTRAP     ################################
###################   WHOLE DATASET   #################################
#######################################################################

# Load the match-level data for the period under study.
Xf <- readRDS("XP1.rds")                              # PERIOD 1 (SEASONS 2009-2011)
# Xf <- readRDS("XP2.rds") and run the same script for  PERIOD 2 (SEASONS 2012-2016)

###### EXTRACT VARIABLES OF INTEREST

# Outcome: column 47 of the raw data.
Y <- Xf[, 47]

# Predictors: columns 25 to 46, each passed through scale() and gathered
# back into a data.frame.
Xf_DIFF <- as.data.frame(apply(Xf[, 25:46], 2, scale))

# Append the outcome as the last column (named "Y").
Xf_DIFF <- cbind(Xf_DIFF, Y)



#_________________________________________________________________#
#                       BOOTSTRAP                                 #
#_________________________________________________________________#
###### SET THE SEED AND CREATE LISTS to be saved
set.seed(14081975)

n_boot <- 1000L       # number of random train/test splits
train_share <- 0.7    # fraction of rows used to fit the models

# Result containers, one slot per replication.
fit.mlog_learn <- vector("list", n_boot)  # fitted multinom() models
y.mlog <- vector("list", n_boot)          # predictions on the held-out rows
CM.mlogit <- vector("list", n_boot)       # confusion matrices of those predictions
mod <- vector("list", n_boot)             # fitted glm() models
ad.stats <- vector("list", n_boot)        # pscl::pR2() output for each glm
vif <- vector("list", n_boot)             # car::vif() output for each glm


##########################################################################
# BINOMIAL LOGIT MODEL (BLR MODEL BY SEASONS - TABLE 1 & 2 OF THE PAPER) #
##########################################################################

# The split size does not change between replications, so compute it once.
n_obs <- nrow(Xf_DIFF)
n_train <- floor(train_share * n_obs)

for (i in seq_len(n_boot)) {

  # Random split: n_train rows for fitting, the remaining rows for testing.
  trainingRows <- sample(seq_len(n_obs), n_train)
  learn_DIFF <- Xf_DIFF[trainingRows, ]
  test_DIFF <- Xf_DIFF[-trainingRows, ]

  # Fit on the training rows, predict the held-out rows, and score the
  # predictions with "win" as the positive class.
  fit.mlog_learn[[i]] <- multinom(Y ~ ., data = learn_DIFF)
  y.mlog[[i]] <- predict(fit.mlog_learn[[i]], test_DIFF)
  CM.mlogit[[i]] <- caret::confusionMatrix(y.mlog[[i]], test_DIFF$Y, positive = "win")

  # Logit fit on the same training rows. The VIFs and pseudo-R2 statistics
  # are taken from this single fit instead of refitting the model; car::vif
  # is namespaced because the result list above is also called `vif`.
  mod[[i]] <- glm(Y ~ ., family = binomial(logit), data = learn_DIFF)
  vif[[i]] <- car::vif(mod[[i]])
  ad.stats[[i]] <- pscl::pR2(mod[[i]])
}






##################################################################################
# BINOMIAL LOGIT MODEL (BLR MODEL BY COUNTRY AND SEASONS - TABLE 3 OF THE PAPER) #
##################################################################################

# Repeats the 1000-split bootstrap separately for every country. Results are
# stored under the country value `i` in the same lists used by the
# whole-dataset bootstrap (e.g. CM.mlogit[[i]][[j]]).
# NOTE: Xf_DIFF and Y are overwritten on every pass, so after the loop they
# hold the data of the last country only.
for (i in unique(Xf$country_name)) {

  # Sub-sample: rows of Xf whose country_name equals `i`.
  Xf_DIFF <- subset(Xf, country_name == i)

  # Categorical outcome (column 47).
  Y <- Xf_DIFF[, 47]

  # Numeric variables of interest (columns 25 to 46).
  Xf_DIFF <- Xf_DIFF[, 25:46]

  #### standardize numeric variables (within the country sub-sample)
  Xf_DIFF <- apply(Xf_DIFF, 2, scale)

  #### gather everything in one data.frame
  Xf_DIFF <- as.data.frame(Xf_DIFF)
  Xf_DIFF <- cbind(Xf_DIFF, Y)

  # Create the per-country containers explicitly, so that the nested
  # assignments below never depend on how `[[<-` treats a missing element.
  fit.mlog_learn[[i]] <- list()
  y.mlog[[i]] <- list()
  CM.mlogit[[i]] <- list()
  mod[[i]] <- list()
  ad.stats[[i]] <- list()

  # The split size is constant within a country.
  n_obs <- nrow(Xf_DIFF)
  n_train <- floor(0.7 * n_obs)

  for (j in seq_len(1000)) {

    # Random 70/30 split of the country sub-sample.
    trainingRows <- sample(seq_len(n_obs), n_train)
    learn_DIFF <- Xf_DIFF[trainingRows, ]
    test_DIFF <- Xf_DIFF[-trainingRows, ]

    # Fit, predict the held-out rows, and score with "win" as positive class.
    fit.mlog_learn[[i]][[j]] <- multinom(Y ~ ., data = learn_DIFF)
    y.mlog[[i]][[j]] <- predict(fit.mlog_learn[[i]][[j]], test_DIFF)
    CM.mlogit[[i]][[j]] <- caret::confusionMatrix(y.mlog[[i]][[j]], test_DIFF$Y, positive = "win")

    # Logit fit and its pseudo-R2 statistics.
    mod[[i]][[j]] <- glm(Y ~ ., family = binomial(logit), data = learn_DIFF)
    ad.stats[[i]][[j]] <- pscl::pR2(mod[[i]][[j]])
  }
}





#__________________________________________________________________________#
#                    SECTION 5   -  TAB 4, 5 & 6                           #
#__________________________________________________________________________#

# RE-ADJUSTMENT OF DATA
library(dplyr)
library(elo)
library(lme4)
# Read both periods and stack them into a single data set.
XP1 <- readRDS("XP1.rds")
XP2 <- readRDS("XP2.rds")

Xf <- rbind(XP1, XP2)

# Country as a factor, season as an ordered factor.
Xf$country_name <- factor(Xf$country_name)
Xf$season <- factor(Xf$season, ordered = TRUE)

# Team ids as character; they identify the teams in the Elo formula below.
Xf$home_team_api_id <- as.character(Xf$home_team_api_id)
Xf$away_team_api_id <- as.character(Xf$away_team_api_id)

# Sort the matches by country, season and match date. The grouping is only
# needed for the sort, so it is dropped again to avoid leaking grouped
# semantics into later steps.
Xf <- Xf %>%
  group_by(country_name, season) %>%
  arrange(date_match, .by_group = TRUE) %>%
  ungroup()

# Elo computation
# The home team id is wrapped in adjust(..., 100) and the K factor is 30.
elo1 <- elo.run(
  score(home_team_goal, away_team_goal) ~
    adjust(home_team_api_id, 100) + away_team_api_id,
  data = Xf,
  k = 30
)

elo2 <- as.data.frame(elo1)

# Exact column lookup: `$update` would partially match other column names and
# silently return NULL when the match is ambiguous.
elo_update <- elo2[["update"]]
if (is.null(elo_update)) {
  stop(
    "as.data.frame(elo1) has no 'update' column; check the column names ",
    "produced by the installed 'elo' version.",
    call. = FALSE
  )
}

# Undo the per-match update to obtain the ratings before each match.
# NOTE(review): presumably elo.A / elo.B are post-match ratings and `update`
# is the change applied to team A (and subtracted from team B) - confirm
# against the elo package documentation.
elo2$pre.team.A <- elo2$elo.A - elo_update  # home team
elo2$pre.team.B <- elo2$elo.B + elo_update  # away team

# Attach the pre-match ratings and their difference (home minus away).
Xf_Elo <- data.frame(Xf, elo_home = elo2$pre.team.A, elo_away = elo2$pre.team.B)
Xf_Elo$elo <- Xf_Elo$elo_home - Xf_Elo$elo_away


# Standardize columns 25-46 and column 50 in place.
# NOTE(review): column 50 is presumably `elo` (last column added above) - confirm.
scaled_cols <- c(25:46, 50)
Xf_Elo[, scaled_cols] <- scale(Xf_Elo[, scaled_cols])

# Keep only the rows whose date_match compares greater than "2010-01-02".
after_cutoff <- Xf_Elo$date_match > "2010-01-02"
Xf_Elo <- Xf_Elo[after_cutoff, ]

set.seed(987654321)

# 70/30 partition on the outcome; both sets keep the same column subset.
model_cols <- c(3, 6, 25:47, 50)
Train <- createDataPartition(Xf_Elo$Y, p = .70, list = FALSE)
trainData <- Xf_Elo[Train, model_cols]
testData <- Xf_Elo[-Train, model_cols]



# Missing Data have been imputed with the caret package
# Sample code for imputation
# library(caret)
# preProcess_missingdata_model <- preProcess(trainData, method='knnImpute')
# preProcess_missingdata_model
# 
# 
# # Use the imputation model to predict the values of missing data points
# library(RANN)  # required for knnImpute
# trainData <- predict(preProcess_missingdata_model, newdata = trainData)



########
# BLR1 #
########

# BLR1: binomial logit on the 22 rating differences, fitted through caret.
mod_x <- train(
  Y ~ Diff_ATT_ATT + Diff_ATT_CEN + Diff_ATT_DEF
    + Diff_SKI_ATT + Diff_SKI_CEN + Diff_SKI_DEF
    + Diff_MOV_ATT + Diff_MOV_CEN + Diff_MOV_DEF
    + Diff_POW_ATT + Diff_POW_CEN + Diff_POW_DEF
    + Diff_MEN_ATT + Diff_MEN_CEN + Diff_MEN_DEF
    + Diff_DEF_ATT + Diff_DEF_CEN + Diff_DEF_DEF
    + Diff_MOV_GOK + Diff_POW_GOK + Diff_MEN_GOK
    + Diff_GOK_GOK,
  data = trainData, method = "glm", family = "binomial"
)

# Predicted classes on the test set.
Y.mlog <- predict(mod_x, testData)

# Confusion matrix with "win" as the positive class.
CM_x <- confusionMatrix(Y.mlog, testData$Y, positive = "win")
print(CM_x, digits = 3)

# AIC() and Anova() work on fitted model objects, not on caret's `train`
# wrapper, so they are applied to the underlying glm in `$finalModel`.
# NOTE(review): if Anova() rejects the caret-fitted glm, refit the same
# formula directly with stats::glm() for this table.
AIC(mod_x$finalModel)
Anova(mod_x$finalModel, type = 3)






########
# BLR2 #
########

# BLR2: same binomial logit as BLR1 with the Elo difference `elo` added.
mod_x_elo <- train(
  Y ~ Diff_ATT_ATT + Diff_ATT_CEN + Diff_ATT_DEF
    + Diff_SKI_ATT + Diff_SKI_CEN + Diff_SKI_DEF
    + Diff_MOV_ATT + Diff_MOV_CEN + Diff_MOV_DEF
    + Diff_POW_ATT + Diff_POW_CEN + Diff_POW_DEF
    + Diff_MEN_ATT + Diff_MEN_CEN + Diff_MEN_DEF
    + Diff_DEF_ATT + Diff_DEF_CEN + Diff_DEF_DEF
    + Diff_MOV_GOK + Diff_POW_GOK + Diff_MEN_GOK
    + Diff_GOK_GOK + elo,
  data = trainData, method = "glm", family = "binomial"
)

# Predicted classes on the test set.
Y.mlog <- predict(mod_x_elo, testData)

# Confusion matrix with "win" as the positive class.
CM_x_elo <- confusionMatrix(Y.mlog, testData$Y, positive = "win")
print(CM_x_elo, digits = 3)

# AIC() and Anova() work on fitted model objects, not on caret's `train`
# wrapper, so they are applied to the underlying glm in `$finalModel`.
# NOTE(review): if Anova() rejects the caret-fitted glm, refit the same
# formula directly with stats::glm() for this table.
AIC(mod_x_elo$finalModel)
Anova(mod_x_elo$finalModel, type = 3)





########
# BLR3 #
########

# BLR3: binomial mixed model on the 22 rating differences with random
# intercepts for country and for season nested within country.
mod_x_re <- glmer(
  Y ~ 1 + Diff_ATT_ATT + Diff_ATT_CEN + Diff_ATT_DEF
    + Diff_SKI_ATT + Diff_SKI_CEN + Diff_SKI_DEF
    + Diff_MOV_ATT + Diff_MOV_CEN + Diff_MOV_DEF
    + Diff_POW_ATT + Diff_POW_CEN + Diff_POW_DEF
    + Diff_MEN_ATT + Diff_MEN_CEN + Diff_MEN_DEF
    + Diff_DEF_ATT + Diff_DEF_CEN + Diff_DEF_DEF
    + Diff_MOV_GOK + Diff_POW_GOK + Diff_MEN_GOK
    + Diff_GOK_GOK + (1 | country_name / season),
  family = "binomial", data = trainData
)

# 1 when the predicted probability exceeds 0.5, otherwise 0.
p <- as.numeric(predict(mod_x_re, testData, type = "response") > 0.5)

# Same 0 -> "non-win", 1 -> "win" labelling as before, but carried by a factor
# with both levels declared: the accuracy below then compares labels with
# labels, and the table stays 2 x 2 even if only one class is predicted.
p_class <- factor(p, levels = c(0, 1), labels = c("non-win", "win"))

# Share of test rows predicted correctly.
mean(as.character(p_class) == as.character(testData$Y))

tab <- table(p_class, testData$Y, dnn = c("p", ""))

CM_x_re <- confusionMatrix(tab, positive = "win")


AIC(mod_x_re)
Anova(mod_x_re, type = 3)





########
# BLR4 #
########
# BLR4: the mixed model of BLR3 with the Elo difference `elo` added.
mod_x_elo_re <- glmer(
  Y ~ 1 + Diff_ATT_ATT + Diff_ATT_CEN + Diff_ATT_DEF
    + Diff_SKI_ATT + Diff_SKI_CEN + Diff_SKI_DEF
    + Diff_MOV_ATT + Diff_MOV_CEN + Diff_MOV_DEF
    + Diff_POW_ATT + Diff_POW_CEN + Diff_POW_DEF
    + Diff_MEN_ATT + Diff_MEN_CEN + Diff_MEN_DEF
    + Diff_DEF_ATT + Diff_DEF_CEN + Diff_DEF_DEF
    + Diff_MOV_GOK + Diff_POW_GOK + Diff_MEN_GOK
    + Diff_GOK_GOK + elo + (1 | country_name / season),
  family = "binomial", data = trainData
)

print(summary(mod_x_elo_re), digits = 3)

# 1 when the predicted probability exceeds 0.5, otherwise 0.
p <- as.numeric(predict(mod_x_elo_re, testData, type = "response") > 0.5)

# Same 0 -> "non-win", 1 -> "win" labelling as before, but carried by a factor
# with both levels declared: the accuracy below then compares labels with
# labels, and the table stays 2 x 2 even if only one class is predicted.
p_class <- factor(p, levels = c(0, 1), labels = c("non-win", "win"))

# Share of test rows predicted correctly.
mean(as.character(p_class) == as.character(testData$Y))

tab <- table(p_class, testData$Y, dnn = c("p", ""))

# Confusion matrix with "win" as the positive class (printed once).
CM_x_elo_re <- confusionMatrix(tab, positive = "win")
print(CM_x_elo_re)

AIC(mod_x_elo_re)
Anova(mod_x_elo_re, type = 3)






###############################################################################
###############################################################################
###########################  BOOTSTRAP 500 REP  ###############################
###############################################################################
###############################################################################
# Adjust according to the model considered

n_rep <- 500L   # number of train/test re-partitions

# Result containers, one slot per replication.
mod <- vector("list", n_rep)      # fitted glmer() models
y.mlog <- vector("list", n_rep)   # 0/1 predictions on the test set
CM <- vector("list", n_rep)       # confusion matrices

# Impute missing values with caret's k-nearest-neighbour method. The
# imputation model is fitted on Xf_Elo and applied to Xf_Elo itself, so the
# data keeps all its rows and columns; the loop below selects columns by
# position and needs the full layout.
preProcess_missingdata_model <- preProcess(Xf_Elo, method = "knnImpute")
preProcess_missingdata_model
Xf_Elo <- predict(preProcess_missingdata_model, newdata = Xf_Elo)

# Train, trainData and testData are overwritten on every pass; after the loop
# they hold the last partition.
for (i in seq_len(n_rep)) {
  # building training and test set
  Train <- createDataPartition(Xf_Elo$Y, p = .70, list = FALSE)
  trainData <- Xf_Elo[Train, c(3, 6, 25:47, 50)]
  testData <- Xf_Elo[-Train, c(3, 6, 25:47, 50)]

  mod[[i]] <- glmer(
    Y ~ 1 + Diff_ATT_ATT + Diff_ATT_CEN + Diff_ATT_DEF
      + Diff_SKI_ATT + Diff_SKI_CEN + Diff_SKI_DEF
      + Diff_MOV_ATT + Diff_MOV_CEN + Diff_MOV_DEF
      + Diff_POW_ATT + Diff_POW_CEN + Diff_POW_DEF
      + Diff_MEN_ATT + Diff_MEN_CEN + Diff_MEN_DEF
      + Diff_DEF_ATT + Diff_DEF_CEN + Diff_DEF_DEF
      + Diff_MOV_GOK + Diff_POW_GOK + Diff_MEN_GOK
      + Diff_GOK_GOK + elo + (1 | country_name / season),
    family = "binomial", data = trainData
  )

  # predict outcome: 1 when the predicted probability exceeds 0.5, else 0
  y.mlog[[i]] <- as.numeric(predict(mod[[i]], testData, type = "response") > 0.5)

  # Label the predictions through a factor with both levels declared, so the
  # table is always 2 x 2 and a replication that predicts a single class does
  # not abort the loop. (The unused accuracy expression that stood here was
  # dropped: its value was discarded inside the loop.)
  pred_class <- factor(y.mlog[[i]], levels = c(0, 1), labels = c("non-win", "win"))
  tab <- table(pred_class, testData$Y, dnn = c("", ""))

  # Confusion Matrix
  CM[[i]] <- confusionMatrix(tab, positive = "win")
}







######################
# RANDOM FOREST (RF) #
######################

# set bootstrap number
# bootControl <- trainControl(number = 500)

# Predictors: the 22 rating differences followed by the Elo difference.
rf_terms <- c(
  "Diff_ATT_ATT", "Diff_ATT_CEN", "Diff_ATT_DEF",
  "Diff_SKI_ATT", "Diff_SKI_CEN", "Diff_SKI_DEF",
  "Diff_MOV_ATT", "Diff_MOV_CEN", "Diff_MOV_DEF",
  "Diff_POW_ATT", "Diff_POW_CEN", "Diff_POW_DEF",
  "Diff_MEN_ATT", "Diff_MEN_CEN", "Diff_MEN_DEF",
  "Diff_DEF_ATT", "Diff_DEF_CEN", "Diff_DEF_DEF",
  "Diff_MOV_GOK", "Diff_POW_GOK", "Diff_MEN_GOK",
  "Diff_GOK_GOK", "elo"
)
rf_formula <- reformulate(rf_terms, response = "Y")

# Random forest trained through caret with method "ranger".
# Add trControl = bootControl to increase bootstrap replication.
rf_fit <- train(rf_formula, data = trainData, method = "ranger")

# print(summary(rf_fit), digits = 3)


# Predicted classes on the test set.
rf_pred <- predict(rf_fit, testData)


# Confusion matrix with "win" as the positive class.
rf_cm <- confusionMatrix(rf_pred, testData$Y, positive = "win")
print(rf_cm, digits = 3)








#########################
# NEURAL NETWORK (NNET) #
#########################
# Predictors: the 22 rating differences followed by the Elo difference.
nnet_terms <- c(
  "Diff_ATT_ATT", "Diff_ATT_CEN", "Diff_ATT_DEF",
  "Diff_SKI_ATT", "Diff_SKI_CEN", "Diff_SKI_DEF",
  "Diff_MOV_ATT", "Diff_MOV_CEN", "Diff_MOV_DEF",
  "Diff_POW_ATT", "Diff_POW_CEN", "Diff_POW_DEF",
  "Diff_MEN_ATT", "Diff_MEN_CEN", "Diff_MEN_DEF",
  "Diff_DEF_ATT", "Diff_DEF_CEN", "Diff_DEF_DEF",
  "Diff_MOV_GOK", "Diff_POW_GOK", "Diff_MEN_GOK",
  "Diff_GOK_GOK", "elo"
)
nnet_formula <- reformulate(nnet_terms, response = "Y")

# Neural network trained through caret with method "nnet".
nnet_fit <- train(nnet_formula, data = trainData, method = "nnet")

# print(summary(nnet_fit), digits = 3)

# Predicted classes on the test set.
nnet_pred <- predict(nnet_fit, testData)

# Confusion matrix with "win" as the positive class.
nnet_cm <- confusionMatrix(nnet_pred, testData$Y, positive = "win")
print(nnet_cm, digits = 3)






####################
# NAIVE BAYES (NB) #
####################

# Predictors: the 22 rating differences followed by the Elo difference.
nb_terms <- c(
  "Diff_ATT_ATT", "Diff_ATT_CEN", "Diff_ATT_DEF",
  "Diff_SKI_ATT", "Diff_SKI_CEN", "Diff_SKI_DEF",
  "Diff_MOV_ATT", "Diff_MOV_CEN", "Diff_MOV_DEF",
  "Diff_POW_ATT", "Diff_POW_CEN", "Diff_POW_DEF",
  "Diff_MEN_ATT", "Diff_MEN_CEN", "Diff_MEN_DEF",
  "Diff_DEF_ATT", "Diff_DEF_CEN", "Diff_DEF_DEF",
  "Diff_MOV_GOK", "Diff_POW_GOK", "Diff_MEN_GOK",
  "Diff_GOK_GOK", "elo"
)
nb_formula <- reformulate(nb_terms, response = "Y")

# Naive Bayes classifier trained through caret with method "nb".
nb_fit <- train(nb_formula, data = trainData, method = "nb")

# print(summary(nb_fit), digits = 3)

# Predicted classes on the test set.
nb_pred <- predict(nb_fit, testData)

# Confusion matrix with "win" as the positive class.
nb_cm <- confusionMatrix(nb_pred, testData$Y, positive = "win")
print(nb_cm, digits = 3)